Comment: Reading the dataset
library(readxl)
# Load the IMDB movie workbook into a tibble. The path points at the current
# user's Desktop, so it has to be adjusted when the script runs elsewhere.
IMDB_data <- read_excel("~/Desktop/Datasets/IMDB data.xlsm")
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the script runs in batch mode.
if (interactive()) {
  View(IMDB_data)
}
Comments: Statistics
# Print the first rows of the table (six rows of twelve columns, per the
# captured output below).
head(IMDB_data)
## # A tibble: 6 x 12
## Rank Title Genre Description Director Actors Year `Runtime (Minut…
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1 Guar… Acti… A group of… James G… Chris… 2014 121
## 2 2 Prom… Adve… Following … Ridley … Noomi… 2012 124
## 3 3 Split Horr… Three girl… M. Nigh… James… 2016 117
## 4 4 Sing Anim… In a city … Christo… Matth… 2016 108
## 5 5 Suic… Acti… A secret g… David A… Will … 2016 123
## 6 6 The … Acti… European m… Yimou Z… Matt … 2016 103
## # ... with 4 more variables: Rating <dbl>, Votes <dbl>, `Revenue
## # (Millions)` <dbl>, Metascore <dbl>
# typeof() reports the internal storage type of the object, which is "list"
# here (see output).
typeof(IMDB_data)
## [1] "list"
# Compact structure overview: class, dimensions (1000 observations of 12
# variables) and the type plus first values of every column.
str(IMDB_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 12 variables:
## $ Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "Guardians of the Galaxy" "Prometheus" "Split" "Sing" ...
## $ Genre : chr "Action,Adventure,Sci-Fi" "Adventure,Mystery,Sci-Fi" "Horror,Thriller" "Animation,Comedy,Family" ...
## $ Description : chr "A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control "| __truncated__ "Following clues to the origin of mankind, a team finds a structure on a distant moon, but they soon realize the"| __truncated__ "Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before t"| __truncated__ "In a city of humanoid animals, a hustling theater impresario's attempt to save his theater with a singing compe"| __truncated__ ...
## $ Director : chr "James Gunn" "Ridley Scott" "M. Night Shyamalan" "Christophe Lourdelet" ...
## $ Actors : chr "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Saldana" "Noomi Rapace, Logan Marshall-Green, Michael Fassbender, Charlize Theron" "James McAvoy, Anya Taylor-Joy, Haley Lu Richardson, Jessica Sula" "Matthew McConaughey,Reese Witherspoon, Seth MacFarlane, Scarlett Johansson" ...
## $ Year : num 2014 2012 2016 2016 2016 ...
## $ Runtime (Minutes) : num 121 124 117 108 123 103 128 89 141 116 ...
## $ Rating : num 8.1 7 7.3 7.2 6.2 6.1 8.3 6.4 7.1 7 ...
## $ Votes : num 757074 485820 157606 60545 393727 ...
## $ Revenue (Millions): num 333 126 138 270 325 ...
## $ Metascore : num 76 65 62 59 40 42 93 71 78 41 ...
# Sample variance of the main numeric columns.
var(IMDB_data$Votes)
## [1] 35631337098
var(IMDB_data$`Runtime (Minutes)`)
## [1] 353.8503
# `Revenue (Millions)` contains missing values, and var() returns NA as soon
# as any input is NA. na.rm = TRUE computes the variance over the observed
# values instead.
var(IMDB_data$`Revenue (Millions)`, na.rm = TRUE)
## (output not re-captured; the original call without na.rm printed NA)
# Per-column summary of the whole table: quartiles and mean for numeric
# columns, length/class/mode for character columns. The last output row
# shows the missing-value counts (128 for revenue, 64 for Metascore).
summary(IMDB_data)
## Rank Title Genre Description
## Min. : 1.0 Length:1000 Length:1000 Length:1000
## 1st Qu.: 250.8 Class :character Class :character Class :character
## Median : 500.5 Mode :character Mode :character Mode :character
## Mean : 500.5
## 3rd Qu.: 750.2
## Max. :1000.0
##
## Director Actors Year Runtime (Minutes)
## Length:1000 Length:1000 Min. :2006 Min. : 66.0
## Class :character Class :character 1st Qu.:2010 1st Qu.:100.0
## Mode :character Mode :character Median :2014 Median :111.0
## Mean :2013 Mean :113.2
## 3rd Qu.:2016 3rd Qu.:123.0
## Max. :2016 Max. :191.0
##
## Rating Votes Revenue (Millions) Metascore
## Min. :1.900 Min. : 61 Min. : 0.00 Min. : 11.00
## 1st Qu.:6.200 1st Qu.: 36309 1st Qu.: 13.27 1st Qu.: 47.00
## Median :6.800 Median : 110799 Median : 47.98 Median : 59.50
## Mean :6.723 Mean : 169808 Mean : 82.96 Mean : 58.99
## 3rd Qu.:7.400 3rd Qu.: 239910 3rd Qu.:113.72 3rd Qu.: 72.00
## Max. :9.000 Max. :1791916 Max. :936.63 Max. :100.00
## NA's :128 NA's :64
# Quartiles and mean for each numeric column of interest, one column per
# call; the captured console output follows each statement.
summary(IMDB_data[["Year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2006 2010 2014 2013 2016 2016
summary(IMDB_data[["Runtime (Minutes)"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66.0 100.0 111.0 113.2 123.0 191.0
summary(IMDB_data[["Rating"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.900 6.200 6.800 6.723 7.400 9.000
summary(IMDB_data[["Votes"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61 36309 110799 169808 239910 1791916
summary(IMDB_data[["Revenue (Millions)"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 13.27 47.98 82.96 113.72 936.63 128
summary(IMDB_data[["Metascore"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.00 47.00 59.50 58.99 72.00 100.00 64
# Count missing cells: first over the whole table, then for the two columns
# that contain them.
sum(is.na(IMDB_data))
## [1] 192
sum(is.na(IMDB_data[["Revenue (Millions)"]]))
## [1] 128
sum(is.na(IMDB_data[["Metascore"]]))
## [1] 64
Comment: Making Categories
# Bin the numeric columns into labelled categories with cut(). Intervals are
# right-closed, i.e. (lower, upper], so a value equal to the lowest break
# falls outside every bin (and becomes NA) unless include.lowest = TRUE.
votes_categories <- cut(
  IMDB_data$Votes,
  breaks = c(60, 37000, 120000, 240000, 1800000),
  labels = c("Low_Votes", "Medium_Votes", "High_Votes", "Highest_Votes")
)
# NOTE(review): the middle label is spelled with a space ("Medium Ratings"),
# unlike its neighbours; it is kept unchanged in case other code matches it.
Rating_categories <- cut(
  IMDB_data$Rating,
  breaks = c(0, 4, 7, 10),
  labels = c("Low_Ratings", "Medium Ratings", "High_Ratings")
)
# The revenue summary reports a minimum of 0.00, which sits exactly on the
# lowest break; include.lowest = TRUE keeps those movies in "Low_Revenue"
# instead of turning them into NA.
Revenue_categories <- cut(
  IMDB_data$`Revenue (Millions)`,
  breaks = c(0, 47, 113, 940),
  labels = c("Low_Revenue", "Medium_Revenue", "High_Revenue"),
  include.lowest = TRUE
)
Metascore_categories <- cut(
  IMDB_data$Metascore,
  breaks = c(10, 46, 60, 72, 100),
  labels = c(
    "Lowest_Metascore", "Low_Metascore", "Medium_Metascore", "High_Metascore"
  )
)
Comment: Binding into dataset
# Append the four category factors to the original table as extra columns,
# each named after the vector it comes from.
IMDB_categorized_data <- cbind(
  IMDB_data,
  votes_categories = votes_categories,
  Rating_categories = Rating_categories,
  Revenue_categories = Revenue_categories,
  Metascore_categories = Metascore_categories
)
Comments: Plotting
# Rating against Rank for movies in the "Low_Votes" bin, drawn in red.
with(IMDB_categorized_data, {
  keep <- votes_categories == "Low_Votes"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 1", xlab = "Rank", ylab = "Ratings",
    col = "red", las = 1
  )
})
Comments: It can be seen that a majority of movies lie in the rating range of 5-8. While on the other hand there is a large variation in rank of movies of approximately same rating. Perhaps we will have to explore the plot on more variables to find the reason for variation.
# Rating against Rank for movies in the "Medium_Votes" bin.
with(IMDB_categorized_data, {
  keep <- votes_categories == "Medium_Votes"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 2", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: The plot is more compressed for medium_votes range as ratings on Y-axis vary from 6-8 for a majority of movies. While on the other hand the disparity of rank remains the same.
# Rating against Rank for movies in the "High_Votes" bin.
with(IMDB_categorized_data, {
  keep <- votes_categories == "High_Votes"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 3", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: Like the plot for Medium_Votes, it can be seen that a majority of movies lie in the rating range of 6-8. While on the other hand there is a large variation in rank of movies of approximately same rating. Perhaps we will have to explore the plot on more variables to find the reason for variation.
# Rating against Rank for movies in the "Highest_Votes" bin.
with(IMDB_categorized_data, {
  keep <- votes_categories == "Highest_Votes"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 4", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: The plot is more compressed for highest_votes range as ratings on Y-axis vary from 6.5-8 for a majority of movies. Also it can be seen that there is a huge concentration of movies from rank 100-450.
# Rating against Rank for movies in the "Low_Revenue" bin.
with(IMDB_categorized_data, {
  keep <- Revenue_categories == "Low_Revenue"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 5", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that for low_revenue, the ratings vary from 5.5 to 8 for a majority of movies. But there is a vast divergence in terms of rank.
# Rating against Rank for movies in the "Medium_Revenue" bin.
with(IMDB_categorized_data, {
  keep <- Revenue_categories == "Medium_Revenue"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 6", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: The ratings vary from 6 to 8 for a majority of movies, while the rank varies from 10 to 1000.
# Rating against Rank for movies in the "High_Revenue" bin.
with(IMDB_categorized_data, {
  keep <- Revenue_categories == "High_Revenue"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 7", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that ratings vary from 6-8 and also there is a huge concentration of points from 0-400 rank. Hence we can conclude that movies which generate a high revenue generally have a rating above 6 and rank below 400 with some exceptions.
# Rating against Rank for movies in the "Lowest_Metascore" bin.
with(IMDB_categorized_data, {
  keep <- Metascore_categories == "Lowest_Metascore"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 8", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that for lowest_metascore categories ratings usually vary in the range of 5-7, while the rank is usually above 350. Hence we can conclude that metascore depends on rank as well as ratings for lowest_metascore categories.
# Rating against Rank for movies in the "Low_Metascore" bin.
with(IMDB_categorized_data, {
  keep <- Metascore_categories == "Low_Metascore"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 9", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that for low_metascore categories ratings usually vary in the range of 6-8, while the rank is varying from 10-1000.
# Rating against Rank for movies in the "Medium_Metascore" bin.
with(IMDB_categorized_data, {
  keep <- Metascore_categories == "Medium_Metascore"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 10", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that for medium_metascore categories ratings usually vary in the range of 6.5-8, while the rank has a large variance.
# Rating against Rank for movies in the "High_Metascore" bin.
with(IMDB_categorized_data, {
  keep <- Metascore_categories == "High_Metascore"
  plot(
    Rank[keep], Rating[keep],
    main = "Rank vs Ratings 11", xlab = "Rank", ylab = "Ratings", las = 1
  )
})
Comments: It can be seen that for high_metascore categories ratings usually vary in the range of 7-8, while rank has a large variance.
# Runtime against Rank for every movie in the table.
with(
  IMDB_data,
  plot(
    Rank, `Runtime (Minutes)`,
    main = "Rank vs Runtime", xlab = "Rank", ylab = "Runtime", las = 1
  )
)
Comments: It can be concluded that a majority of movies have a runtime between 90-130 minutes.
# Natural log of Votes against Rank for every movie. The y axis shows the
# log-transformed values, so the axis label says so explicitly.
plot(
  IMDB_data$Rank, log(IMDB_data$Votes),
  main = "Rank vs Votes", xlab = "Rank", ylab = "log(Votes)", las = 1
)
Comments: It can be concluded that log(IMDB_data$Votes) varies between 10 and 14 for a majority of movies; that is, movies of every rank fall within the same range of votes, so the number of votes cannot be a governing factor for rank.
# Metascore against Revenue for movies in the "Low_Ratings" bin.
with(IMDB_categorized_data, {
  keep <- Rating_categories == "Low_Ratings"
  plot(
    `Revenue (Millions)`[keep], Metascore[keep],
    main = "Revenue vs Metascore 1", xlab = "Revenue", ylab = "Metascore",
    las = 1
  )
})
Comments: It can be seen that in low_rating categories majority of movies generate a revenue below 20 million dollars. Moreover their metascore is also below 55.
# Metascore against Revenue for movies in the "High_Ratings" bin.
with(IMDB_categorized_data, {
  keep <- Rating_categories == "High_Ratings"
  plot(
    `Revenue (Millions)`[keep], Metascore[keep],
    main = "Revenue vs Metascore 2", xlab = "Revenue", ylab = "Metascore",
    las = 1
  )
})
Comments: It can be seen that for the high_ratings category, movies have a metascore varying from 30 to 90, but a majority of movies have generated a revenue below 150 million dollars. So a high/low metascore in the high rating category cannot necessarily imply that a movie would earn above 150 million dollars.
# Revenue against Rating for movies in the "Low_Metascore" bin.
with(IMDB_categorized_data, {
  keep <- Metascore_categories == "Low_Metascore"
  plot(
    Rating[keep], `Revenue (Millions)`[keep],
    main = "Rating vs Revenue", xlab = "Rating", ylab = "Revenue", las = 1
  )
})
Comments: It can be seen that for the low_metascore category, ratings are usually between 5.5 and 7.5, while revenue is below 100 million dollars. It can hence be concluded that revenue generation can definitely affect the metascore of a movie.
# Metascore against Rating for movies in the "Low_Revenue" bin.
# The y-axis label previously read 'Metacore'; it now names the column
# correctly.
plot(
  IMDB_categorized_data$Rating[Revenue_categories == "Low_Revenue"],
  IMDB_categorized_data$Metascore[Revenue_categories == "Low_Revenue"],
  main = "Rating vs Metascore", xlab = "Rating", ylab = "Metascore", las = 1
)
Comments: It can be said that as the ratings increase the metascore also increases for a majority of low_revenue category movies.
# Runtime against Rating for every movie in the categorized table.
with(
  IMDB_categorized_data,
  plot(
    Rating, `Runtime (Minutes)`,
    main = "Rating vs Runtime", xlab = "Rating", ylab = "Runtime", las = 1
  )
)
Comments: It can be seen that as runtime increases, ratings also increase within a certain range for a majority of movies.
Comments: ggplots
library(ggplot2)
# Histograms of Rating, Votes and Metascore. Each base plot object is stored
# first, then printed with a histogram layer and a title added.
p1 <- ggplot(IMDB_data, aes(x = Rating))
p1 +
  geom_histogram(binwidth = 0.1) +
  labs(title = "Histogram for Rating")
p2 <- ggplot(IMDB_data, aes(x = Votes))
p2 +
  geom_histogram(binwidth = 50000) +
  labs(title = "Histogram for Votes")
p3 <- ggplot(IMDB_data, aes(x = Metascore))
p3 +
  geom_histogram(binwidth = 10) +
  labs(title = "Histogram for Metascore")
## Warning: Removed 64 rows containing non-finite values (stat_bin).
Comments: qqplots
# Normal Q-Q plot of the ratings of "Low_Revenue" movies, with a red
# reference line drawn through it.
with(IMDB_categorized_data, {
  low_revenue_ratings <- Rating[Revenue_categories == "Low_Revenue"]
  qqnorm(low_revenue_ratings)
  qqline(low_revenue_ratings, col = "red")
})
Comments: It can be seen that the ratings of low_revenue category movies are approximately normally distributed for a majority of the data points.
# Two-sample Q-Q plot: quantiles of Rating against quantiles of Metascore,
# both restricted to "Low_Revenue" movies.
with(IMDB_categorized_data, {
  keep <- Revenue_categories == "Low_Revenue"
  qqplot(
    Rating[keep], Metascore[keep],
    xlab = "Ratings", ylab = "Metascore", col = "blue", las = 1
  )
})
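Comments: The qqplot for Ratings vs Metascore compares the quantiles of the two variables with each other: points lying close to a straight line indicate that Ratings and Metascore have similarly shaped distributions. Unlike qqnorm, it does not by itself show that either variable is normally distributed.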